{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"Script for plotting a bar chart with types of variable stars in\n",
    "the current version of the General Catalog of Variable Stars (GCVS).\n",
    "Data sources:\n",
    "http://www.sai.msu.su/gcvs/gcvs/gcvs5/gcvs5.txt\n",
    "https://cdsarc.cds.unistra.fr/viz-bin/cat/B/vsx\n",
    "\n",
    "According to GCVS Variability Types description,\n",
    "http://www.sai.msu.su/gcvs/gcvs/vartype.htm\n",
    "if a variable belongs to several types of variability, the types are joined\n",
    "in the data field by a \"+\" sign, e.g., E+UG, UV+BY.\n",
    "Multiple classifications for object types are separated by a solidus (\"/\").\n",
    "We collect them separately in additional data arrays.\n",
    "Uncertainty on type of variability marked with a colon (:) is discarded for simplicity.\n",
    "\n",
    "According to variable star type designations in vsx,\n",
    "https://www.aavso.org/vsx/index.php?view=about.vartypes\n",
    "A colon (:) after the variability type -or any other field- means\n",
    "the value/classification is uncertain.\n",
    "A pipe character (|) between two different types signifies a logical OR;\n",
    "the classification is uncertain and all possible types are indicated.\n",
    "An example of this is ELL|DSCT, where the star may be an ellipsoidal binary system\n",
    "or a DSCT-type pulsating variable with half the given period.\n",
    "A plus character (+) signifies a logical AND; two different variability types\n",
    "are seen in the same star or system. An example of this would be ELL+DSCT, where\n",
    "one of the components of an ellipsoidal binary system is a DSCT-type pulsating variable.\n",
    "A slash character (/) indicates a subtype. In the case of binary systems (eclipsing,\n",
    "ellipsoidal or reflection variables) it is used to help describe either the physical\n",
    "properties of the system (E/PN or EA/RS), the luminosity class of the components (EA/DM),\n",
    "or the degree of filling of their inner Roche lobes (EA/SD).\n",
    "This is the GCVS classification system. In cataclysmic variables, slash characters\n",
    "are used to indicate some properties of the system, as in the degree of polarization\n",
    "(NA/DQ) or the nature of their components (UG/IBWD).\n",
    "\"\"\"\n",
    "\n",
    "\n",
    "import os\n",
    "\n",
    "from scour import scour\n",
    "import matplotlib.pyplot as plt\n",
    "from matplotlib.ticker import MultipleLocator\n",
    "import pandas as pd\n",
    "\n",
    "\n",
    "def optimize_svg(tmp_path, path):\n",
    "    \"\"\"Optimize the svg file at tmp_path with scour and write the result to path.\"\"\"\n",
    "    # Option attributes set on top of scour's defaults, applied in this order.\n",
    "    overrides = {\n",
    "        \"enable_viewboxing\": True,\n",
    "        \"strip_comments\": True,\n",
    "        \"strip_ids\": True,\n",
    "        \"remove_metadata\": True,\n",
    "        \"shorten_ids\": True,\n",
    "        \"indent_type\": \"none\",\n",
    "        \"newlines\": False,\n",
    "    }\n",
    "    with open(tmp_path, \"rb\") as svg_in:\n",
    "        with open(path, \"wb\") as svg_out:\n",
    "            scour_opts = scour.generateDefaultOptions()\n",
    "            for attr, value in overrides.items():\n",
    "                setattr(scour_opts, attr, value)\n",
    "            scour.start(scour_opts, svg_in, svg_out)\n",
    "\n",
    "\n",
    "def fill_dct(dct, typ):\n",
    "    \"\"\"Count one more occurrence of typ in dct (a missing key starts at 1).\"\"\"\n",
    "    dct[typ] = dct.get(typ, 0) + 1\n",
    "\n",
    "\n",
    "types_dct = {}\n",
    "types_plus = {}\n",
    "types_slash = {}\n",
    "STRIP = True\n",
    "\n",
    "# Read the GCVS file, take the variability type of each record and count the types.\n",
    "# If STRIP is True, leading/trailing colons (uncertainty marks) are removed so that\n",
    "# uncertain types are merged with the certain ones.\n",
    "# Components of \"+\" and \"/\" compound types are additionally counted in\n",
    "# types_plus and types_slash.\n",
    "# NOTE(review): str.strip(\":\") does not touch colons inside a compound type, so a\n",
    "# value like \"A:+B\" stays a separate key in types_dct - confirm this is intended.\n",
    "# The handle is named gcvs_file so it is not confused with the `gcvs` Series below.\n",
    "with open(\"../../../data/gcvs/gcvs5.txt\", encoding=\"ascii\") as gcvs_file:\n",
    "    # Iterate over the file lazily; the full list of lines is never needed.\n",
    "    for line in gcvs_file:\n",
    "        typ = line[41:51].strip()  # characters 41..50 of the record hold the type\n",
    "        if STRIP:\n",
    "            typ = typ.strip(\":\")\n",
    "        # Same splitting logic for both separators, \"+\" first and then \"/\".\n",
    "        for sep, parts_dct in ((\"+\", types_plus), (\"/\", types_slash)):\n",
    "            if sep in typ:\n",
    "                for typsplit in typ.split(sep):\n",
    "                    if STRIP:\n",
    "                        typsplit = typsplit.strip(\":\")\n",
    "                    fill_dct(parts_dct, typsplit)\n",
    "        fill_dct(types_dct, typ)\n",
    "\n",
    "# Alternative table that also includes the \"+\" and \"/\" component counts\n",
    "# (kept for reference; NUM is defined in the plotting cell):\n",
    "# df_gcvs = pd.DataFrame({\n",
    "#         \"gcvs\": pd.Series(types_dct),\n",
    "#         \"+\": pd.Series(types_plus),\n",
    "#         \"/\": pd.Series(types_slash),\n",
    "#     }).fillna(0).sort_values(by=\"gcvs\")[NUM:]\n",
    "gcvs = pd.Series(types_dct).sort_values(ascending=False)\n",
    "\n",
    "vsx_data_raw = pd.read_csv(\"../../../data/vsx/vsx_csv.dat\", usecols=[\"Type\"])\n",
    "vsx_types = vsx_data_raw[\"Type\"].str.strip(\":\")\n",
    "# value_counts() is called on the Series directly: squeeze() would turn a\n",
    "# one-element Series into a scalar, which has no value_counts().\n",
    "vsx = vsx_types.value_counts()\n"
   ]},{
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "GCVS sorted:\n",
      "         vsx    gcvs\n",
      "EB     29270   55142\n",
      "RR      1888   63560\n",
      "RRC    39838   65235\n",
      "SRB     4007  110635\n",
      "SR    327214  123963\n",
      "EW    404075  134953\n",
      "LB      3641  153931\n",
      "EA    101690  157984\n",
      "RRAB   99254  278401\n",
      "M      82446  356185\n",
      "VSX sorted:\n",
      "         vsx    gcvs\n",
      "BY     86947   48790\n",
      "EC     94350      77\n",
      "ROT    98141       0\n",
      "RRAB   99254  278401\n",
      "EA    101690  157984\n",
      "SRS   119308    4364\n",
      "VAR   147885      77\n",
      "SR    327214  123963\n",
      "E     338648   29071\n",
      "EW    404075  134953\n"
     ]}, {
     "data": {
      "text/plain": []
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Scale factor applied to the GCVS counts so that both catalogues are comparable\n",
    "# on one axis (the legend labels the GCVS bars as \"x39\").\n",
    "# NOTE(review): presumably total VSX objects / total GCVS objects - confirm.\n",
    "K = 38.97 # 2268196/58202\n",
    "df = pd.DataFrame({\"vsx\": vsx,\n",
    "                     \"gcvs\": gcvs * K}).fillna(0).astype(int)\n",
    "\n",
    "# Negative slice start: keep the last 39 rows of each ascending sort.\n",
    "NUM = -39\n",
    "df_gcvs = df.sort_values(by=\"gcvs\")[NUM:]\n",
    "df_vsx = df.sort_values(by=\"vsx\")[NUM:]\n",
    "print(\"GCVS sorted:\")\n",
    "print(df_gcvs[-10:])\n",
    "print(\"VSX sorted:\")\n",
    "print(df_vsx[-10:])\n",
    "\n",
    "ax = df_gcvs.plot.bar(figsize=(16, 9), width=0.88, rot=45)\n",
    "# ax = df_vsx.plot.bar(figsize=(16, 9), width=0.88, rot=45)\n",
    "ax.legend([\n",
    "    \"Типы переменных звезд VSX\",\n",
    "    \"Типы переменных звезд ОКПЗ x39\",\n",
    "    ], fontsize=12, loc=\"upper left\")\n",
    "\n",
    "plt.subplots_adjust(left=0.051, bottom=0.102, right=0.985, top=0.955)\n",
    "plt.xlabel(\"Типы переменных звезд\", fontsize=14)\n",
    "plt.ylabel(\"Количество переменных звезд\", fontsize=14, labelpad=0)\n",
    "# Totals in the title use the unscaled counts of both catalogues.\n",
    "plt.title(\"Распределение по типам переменных звезд в VSX и ОКПЗ, \"\n",
    "    + f\"{vsx.sum()} и {gcvs.sum()} объектов. Июнь 2023 года\",\n",
    "    fontsize=15)\n",
    "\n",
    "FILE_EXT = \"png\"\n",
    "PLT_PTH = \"../../../plots/stars/var_types_distribution-gcvs-sorted\"\n",
    "# PLT_PTH = \"../../../plots/stars/var_types_distribution-vsx-sorted\"\n",
    "tmp_pth = f\"{PLT_PTH}_.{FILE_EXT}\"\n",
    "pth = f\"{PLT_PTH}.{FILE_EXT}\"\n",
    "if FILE_EXT == \"svg\":\n",
    "    # svg: save the raw figure to a temporary file, optimize it into pth,\n",
    "    # then delete the temporary file.\n",
    "    plt.savefig(tmp_pth, dpi=120)\n",
    "    optimize_svg(tmp_pth, pth)\n",
    "    os.remove(tmp_pth)\n",
    "else:\n",
    "    # Other formats are not post-processed, so write straight to the final path\n",
    "    # instead of leaving the figure under the temporary \"_\" name.\n",
    "    plt.savefig(pth, dpi=120)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
